Analysis 1

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(broom)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
library(metR)
## 
## Attaching package: 'metR'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
# Load the weekday and weekend listing tables.
london_weekday <- read.csv("london_weekdays.csv")
london_weekend <- read.csv("london_weekends.csv")

# Recode the non-numeric columns of a listings table so that every column can
# be passed to cor().
#
# `listings`: a data frame with columns room_type, room_shared, room_private
#             and host_is_superhost.
# Returns the same table sorted by room_type, with room_type replaced by its
# factor level index and the three flag columns replaced by 0/1.
encode_for_correlation <- function(listings) {
  listings <- listings[order(listings$room_type), ]
  listings$room_type <- as.numeric(as.factor(listings$room_type))

  flag_columns <- c("room_shared", "room_private", "host_is_superhost")
  for (flag in flag_columns) {
    # read.csv() converts columns made only of True/False to logical, so a
    # comparison against the string "False" never matches and would turn
    # every row into 1. as.logical() handles both the logical column and a
    # "True"/"False" character column.
    listings[[flag]] <- as.numeric(as.logical(listings[[flag]]))
  }
  listings
}

london_weekday <- encode_for_correlation(london_weekday)
london_weekend <- encode_for_correlation(london_weekend)
#cwd <- combined %>% 
#  filter(day_type == "Weekday") 
#cwd <- cwd %>%
#  select(-X, -room_type, -room_shared, -room_private, -host_is_superhost, -day_type)

# Pairwise correlations between all weekday columns, reshaped to long form
# (Var1, Var2, value) for plotting.
corr_cwd <- cor(london_weekday)
corr_cwd <- reshape2::melt(corr_cwd)

# Heat strip of each variable's correlation with the listing price (realSum);
# the price's correlation with itself is dropped.
ggplot(
  corr_cwd %>% filter(Var2 == "realSum" & Var1 != "realSum"),
  aes(Var1, Var2, fill = value, label = round(value, 2))
) +
  geom_tile(color = "white") +
  geom_text(color = "white", size = 4) +
  # `limits` is spelled out in full rather than relying on partial argument
  # matching of `limit`.
  scale_fill_gradient2(
    low = "#440154", mid = "#21918C", high = "#FDE725",
    midpoint = 0, limits = c(-1, 1), space = "Lab", name = "Correlation"
  ) +
  ggtitle("Correlation Plot for Price of Listings") +
  xlab("Variables") + ylab("Price of Listing") +
  labs(subtitle = "Weekday Data") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
  coord_fixed()

#cwe <- combined %>% 
#  filter(day_type == "Weekend") %>%
#  select(-X, -room_type, -room_shared, -room_private, -host_is_superhost, -day_type)

# Pairwise correlations between all weekend columns, reshaped to long form
# (Var1, Var2, value) for plotting.
corr_cwe <- cor(london_weekend)
corr_cwe <- reshape2::melt(corr_cwe)

# Heat strip of each variable's correlation with the listing price (realSum);
# the price's correlation with itself is dropped.
ggplot(
  corr_cwe %>% filter(Var2 == "realSum" & Var1 != "realSum"),
  aes(Var1, Var2, fill = value, label = round(value, 2))
) +
  geom_tile(color = "white") +
  geom_text(color = "white", size = 4) +
  # `limits` is spelled out in full rather than relying on partial argument
  # matching of `limit`.
  scale_fill_gradient2(
    low = "#440154", mid = "#21918C", high = "#FDE725",
    midpoint = 0, limits = c(-1, 1), space = "Lab", name = "Correlation"
  ) +
  ggtitle("Correlation Plot for Price of Listings") +
  xlab("Variables") + ylab("Price of Listing") +
  labs(subtitle = "Weekend Data") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
  coord_fixed()

# Read the raw CSVs again (the earlier copies were recoded for the correlation
# plots), tag each table with its part of the week, and stack them.
london_weekday <- read.csv("london_weekdays.csv")
london_weekday[["day_type"]] <- "Weekday"

london_weekend <- read.csv("london_weekends.csv")
london_weekend[["day_type"]] <- "Weekend"

combined <- rbind(london_weekday, london_weekend)
# Box plot of the nightly price over all listings, to expose outliers.
ggplot(combined, aes(y = realSum)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Listing Price",
    subtitle = "Price Variable Outliers",
    x = "",
    y = "Price per Night (EUR)"
  ) +
  theme_classic()

# Box plot of the distance from the city center, to expose outliers.
ggplot(combined, aes(y = dist)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Listing's Distance from City Center",
    subtitle = "Distance Variable Outliers",
    x = "",
    y = "Distance (km)"
  ) +
  theme_classic()

# Number of listings per room type.
ggplot(combined, aes(x = room_type, y = after_stat(count), fill = room_type)) +
  geom_bar() +
  labs(
    title = "Bar Plot of Room Type",
    subtitle = "Type of Listing",
    x = "Room Type",
    y = "Count"
  ) +
  scale_fill_viridis_d() +
  theme_classic()

# Number of listings per person capacity; the fill uses the capacity as a
# discrete variable so that the viridis discrete scale applies.
ggplot(
  combined,
  aes(x = person_capacity, y = after_stat(count), fill = as.factor(person_capacity))
) +
  geom_bar() +
  labs(
    title = "Bar Plot of Person Capacity",
    subtitle = "Room Capacity",
    x = "Person Capacity",
    y = "Count"
  ) +
  scale_fill_viridis_d(name = "Person Capacity") +
  theme_classic()

# Analysis subset: drop shared rooms, listings 13 or more distance units from
# the center and prices of 1000 or more, keep only the modelling columns, and
# fold capacity 6 into capacity 5.
london_cleaned <- combined %>%
  filter(
    room_type != "Shared room",
    dist < 13,
    realSum < 1000
  ) %>%
  select(day_type, lng, lat, room_type, person_capacity, dist, realSum) %>%
  mutate(
    person_capacity = ifelse(person_capacity == 6, 5, as.integer(person_capacity))
  )
# Count-scaled density of the nightly price, one curve per part of the week.
ggplot() +
  geom_density(
    data = london_cleaned,
    aes(x = realSum, y = after_stat(count), fill = day_type, group = day_type),
    alpha = 0.4
  ) +
  scale_fill_viridis_d(name = "Part of the Week", direction = -1) +
  ggtitle("Density Plot for Price per Night") +
  # The other price axes in this analysis are labelled in EUR; the previous
  # "Pounds" label was inconsistent with them.
  xlab("Price per Night (EUR)") + ylab("Count") +
  labs(subtitle = "Weekday and Weekend Data") +
  theme_classic()

# Price against distance, coloured by capacity, with one panel per
# day type / room type combination and a GAM trend per panel.
ggplot(data = london_cleaned, aes(x = dist, y = realSum, color = as.factor(person_capacity))) +
  geom_point(size = 2, shape = 16, alpha = 0.35) +
  # Passing the bare `gam` function without a formula makes geom_smooth() fall
  # back to `y ~ x`, i.e. a straight line. The smooth term is requested
  # explicitly so the trend is an actual GAM smoother.
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"), color = "black", alpha = 0.5) +
  facet_grid(day_type~room_type, labeller = label_both) +
  scale_color_viridis_d(name = "Room Capacity") +
  ggtitle("Scatter Plot of Combined Data") +
  xlab("Distance (km)") + ylab("Price (EUR)") +
  labs(subtitle = "Distance vs Price, Colored by Room Capacity, Faceted by Room Type and Day Type") +
  theme_classic()

# Mean distance and mean price for every room type / capacity / day type
# combination; the grouping is kept on the result.
london_grouped <- london_cleaned %>%
  group_by(room_type, person_capacity, day_type) %>%
  summarise(
    avg_distance = mean(dist),
    avg_price = mean(realSum),
    .groups = "keep"
  )
# Average price by capacity, one line per room type, split by day type.
ggplot(
  london_grouped,
  aes(x = person_capacity, y = avg_price, group = room_type, color = room_type)
) +
  geom_point() +
  geom_line() +
  facet_grid(~day_type, labeller = label_both) +
  scale_color_viridis_d(name = "Room Type") +
  labs(
    title = "Line Plot of Average Price",
    subtitle = "Average Values for Price across Room Capacities\nColored by Room Type\nFaceted by Time of the Week",
    x = "Room Capacity",
    y = "Average Price"
  ) +
  theme_classic()

# Average distance by capacity, one line per room type, split by day type.
ggplot(
  london_grouped,
  aes(x = person_capacity, y = avg_distance, group = room_type, color = room_type)
) +
  geom_point() +
  geom_line() +
  facet_grid(~day_type, labeller = label_both) +
  scale_color_viridis_d(name = "Room Type") +
  labs(
    title = "Line Plot of Average Distance",
    subtitle = "Average Values for Distance across Room Capacities\nColored by Room Type\nFaceted by Time of the Week",
    x = "Room Capacity",
    y = "Average Distance"
  ) +
  theme_classic()

# Split the cleaned listings by part of the week.
lwd <- filter(london_cleaned, day_type == "Weekday")
lwe <- filter(london_cleaned, day_type == "Weekend")

# Unique coordinate pairs present in each subset.
wdl <- distinct(lwd, lng, lat)
wel <- distinct(lwe, lng, lat)

# Coordinates that occur in both subsets; a coordinate pair is used as the
# key that links a weekday row to a weekend row.
common_locations <- inner_join(wdl, wel, by = join_by(lng, lat))

# Weekday rows at shared coordinates, first row per coordinate pair.
london_weekday_common <- lwd %>%
  inner_join(common_locations, by = join_by(lng, lat)) %>%
  dplyr::rename(weekday_price = realSum) %>%
  distinct(lng, lat, .keep_all = TRUE)

# Weekend rows at shared coordinates, first row per coordinate pair.
london_weekend_common <- lwe %>%
  inner_join(common_locations, by = join_by(lng, lat)) %>%
  dplyr::rename(weekend_price = realSum) %>%
  distinct(lng, lat, .keep_all = TRUE)

# One row per shared coordinate pair with both prices, a label saying when the
# price is higher, and the weekday / weekend price ratio.
london_common <- london_weekday_common %>%
  inner_join(
    select(london_weekend_common, lng, lat, weekend_price),
    by = join_by(lng, lat)
  ) %>%
  select(-day_type) %>%
  mutate(
    price_higher = case_when(
      weekday_price - weekend_price > 0 ~ "Weekdays",
      weekday_price - weekend_price == 0 ~ "Same",
      weekday_price - weekend_price < 0 ~ "Weekends"
    ),
    ratio = weekday_price / weekend_price
  )
head(london_common, 5)
##        lng      lat       room_type person_capacity     dist weekday_price
## 1 -0.16032 51.46531 Entire home/apt               2 5.301018      570.0981
## 2 -0.09683 51.50343    Private room               2 2.198946      297.9844
## 3 -0.10554 51.52407    Private room               2 2.322958      336.7906
## 4 -0.16575 51.46292    Private room               2 5.707825      226.7222
## 5 -0.12055 51.53728    Private room               3 3.257945      256.3560
##   weekend_price price_higher     ratio
## 1      567.0406     Weekdays 1.0053920
## 2      296.5733     Weekdays 1.0047581
## 3      335.1443     Weekdays 1.0049123
## 4      225.5462     Weekdays 1.0052138
## 5      281.0508     Weekends 0.9121339
# Count-scaled density of the weekday minus weekend price per shared listing.
ggplot(data = london_common, aes(x = (weekday_price - weekend_price), y = after_stat(count))) +
  geom_density(fill = "#3B528B", alpha = 0.4) +
  ggtitle("Density Plot for Price Difference on Weekday and Weekend Data") +
  # x carries the price difference and y the count; the two labels were
  # previously swapped.
  xlab("Price Difference") + ylab("Count") +
  labs(subtitle = "Negative -> Price Higher on Weekends \nPositive -> Price Higher on Weekdays") +
  theme_classic()

# Read the Google Maps key from the environment instead of committing it to
# the source file. Any key that was previously committed here should be
# treated as leaked and revoked.
api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (!nzchar(api_key)) {
  stop(
    "Set the GOOGLE_MAPS_API_KEY environment variable before requesting the base map.",
    call. = FALSE
  )
}
register_google(key = api_key)

# Black-and-white base map centered on lon -0.11, lat 51.5 at zoom level 11.
london_map <- get_map(location = c(lon = -0.11, lat = 51.5), zoom = 11, color = "bw")
## ℹ <https://maps.googleapis.com/maps/api/staticmap?center=51.5,-0.11&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx>
# Shared listings drawn on the base map: colour says when the price is higher,
# point size is the absolute weekday/weekend price gap.
ggmap(london_map) +
  geom_point(
    data = london_common,
    mapping = aes(
      x = lng,
      y = lat,
      color = price_higher,
      size = abs(weekday_price - weekend_price)
    ),
    shape = 16,
    alpha = 0.5
  ) +
  scale_color_viridis_d() +
  labs(x = "", y = "") +
  theme_classic()

# Log of the weekday / weekend price ratio against distance, one panel per
# room type and capacity. On the log scale 0 means equal prices.
ggplot(data = london_common, aes(x = dist, y = log(ratio), group = room_type, color = as.factor(person_capacity))) +
  geom_line() +
  facet_grid(room_type~person_capacity, labeller = label_both) +
  scale_color_viridis_d(direction = -1) +
  ggtitle("Line Plot for Price Ratio between Weekdays and Weekends") +
  # The y aesthetic is log(ratio), so the axis and subtitle name the log
  # ratio; the negative/positive reading below only holds on that scale.
  xlab("Distance (km)") + ylab("Log Price Ratio") +
  labs(subtitle = "Log Ratio Value: \nNegative -> Price Higher on Weekends \nPositive -> Price Higher on Weekdays") +
  theme_minimal()

# Mean distance and mean weekday/weekend price of the shared listings for each
# room type / capacity combination; the grouping is kept on the result.
london_common_grouped <- london_common %>%
  group_by(room_type, person_capacity) %>%
  summarise(
    avg_dist = mean(dist),
    avg_weekday_price = mean(weekday_price),
    avg_weekend_price = mean(weekend_price),
    .groups = "keep"
  )
london_common_grouped
## # A tibble: 8 × 5
## # Groups:   room_type, person_capacity [8]
##   room_type       person_capacity avg_dist avg_weekday_price avg_weekend_price
##   <chr>                     <dbl>    <dbl>             <dbl>             <dbl>
## 1 Entire home/apt               2     4.85              386.              392.
## 2 Entire home/apt               3     4.73              417.              422.
## 3 Entire home/apt               4     4.80              504.              516.
## 4 Entire home/apt               5     4.90              585.              599.
## 5 Private room                  2     5.70              196.              203.
## 6 Private room                  3     4.86              231.              237.
## 7 Private room                  4     5.69              241.              256.
## 8 Private room                  5     5.20              335.              363.
# Ratio of average weekday to average weekend price (rounded to 2 decimals)
# by capacity, one panel per room type, from the observed data.
ggplot(
  london_common_grouped,
  aes(
    x = person_capacity,
    y = round((avg_weekday_price / avg_weekend_price), 2),
    color = room_type
  )
) +
  geom_point() +
  geom_line() +
  facet_wrap(~room_type) +
  scale_color_viridis_d(name = "Room Type") +
  labs(
    title = "Line Plot for Average Price Ratio between Weekdays and Weekends",
    subtitle = "Actual Data \nRatio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays",
    x = "Person Capacity",
    y = "Average Price Ratio"
  ) +
  theme_classic()

# Prediction grid: every capacity 2-5 x both room types x distances from 1 to
# 13 in steps of 0.25.
grid <- expand.grid(
  person_capacity = 2:5,
  room_type = c("Entire home/apt", "Private room"),
  dist = seq(1, 13, 0.25)
)

# Weekday GAM: smooth effect of distance plus a room type x capacity
# interaction, with REML smoothness selection.
gam_weekday <- gam(
  realSum ~ s(dist) + room_type * person_capacity,
  data = lwd,
  method = "REML"
)
summary(gam_weekday)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
## 
## Parametric coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            215.638      8.740  24.673  < 2e-16 ***
## room_typePrivate room                  -52.573     12.568  -4.183 2.93e-05 ***
## person_capacity                         73.537      2.422  30.366  < 2e-16 ***
## room_typePrivate room:person_capacity  -53.057      4.580 -11.585  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 7.274  8.291 91.38  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.626   Deviance explained = 62.7%
## -REML =  27091  Scale est. = 14292     n = 4368
# Weekend GAM with the same specification as the weekday model.
gam_weekend <- gam(
  realSum ~ s(dist) + room_type * person_capacity,
  data = lwe,
  method = "REML"
)
summary(gam_weekend)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
## 
## Parametric coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            224.684      7.952  28.253  < 2e-16 ***
## room_typePrivate room                  -79.835     11.612  -6.875 6.93e-12 ***
## person_capacity                         68.805      2.224  30.939  < 2e-16 ***
## room_typePrivate room:person_capacity  -37.733      4.268  -8.842  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 6.608  7.753 114.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.604   Deviance explained = 60.5%
## -REML =  31498  Scale est. = 14384     n = 5076
# Attach each GAM's fitted values to the data it was fitted on.
lwd$weekday_pred <- augment(gam_weekday)[[".fitted"]]
lwe$weekend_pred <- augment(gam_weekend)[[".fitted"]]

# Observed vs fitted weekday prices against distance, per room type and
# capacity.
ggplot(lwd, aes(x = dist)) +
  geom_point(aes(y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  facet_grid(room_type ~ person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Price") +
  labs(
    title = "Plot to Show GAM Fit on Data",
    subtitle = "Weekday Data",
    x = "Distance (km)",
    y = "Price of Listing (EUR)"
  ) +
  theme_minimal()

# Observed vs fitted weekend prices against distance, per room type and
# capacity.
ggplot(lwe, aes(x = dist)) +
  geom_point(aes(y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  facet_grid(room_type ~ person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Price") +
  labs(
    title = "Plot to Show GAM Fit on Data",
    subtitle = "Weekend Data",
    x = "Distance (km)",
    y = "Price of Listing (EUR)"
  ) +
  theme_minimal()

# Predict both GAMs on the common grid and put the two prediction vectors next
# to the grid columns.
predict1 <- predict(gam_weekday, newdata = grid)
predict2 <- predict(gam_weekend, newdata = grid)
result <- data.frame(
  grid,
  weekday_price = as.vector(predict1),
  weekend_price = as.vector(predict2)
)

# Predicted weekday and weekend prices against distance, per room type and
# capacity.
ggplot(result, aes(x = dist)) +
  geom_point(aes(y = weekday_price, color = "Weekday"), shape = 1, size = 0.5) +
  geom_point(aes(y = weekend_price, color = "Weekend"), shape = 1, size = 0.5) +
  facet_grid(room_type ~ person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Time of the Week") +
  labs(
    title = "Plot to Show GAM Predictions",
    subtitle = "Comparing Weekday and Weekend Trends",
    x = "Distance (km)",
    y = "Predicted Price"
  ) +
  theme_minimal()

# Average the grid predictions within each room type / capacity combination;
# the grouping is kept on the result.
result_grouped <- result %>%
  group_by(room_type, person_capacity) %>%
  summarise(
    avg_dist = mean(dist),
    avg_weekday_price = mean(weekday_price),
    avg_weekend_price = mean(weekend_price),
    .groups = "keep"
  )

# Ratio of average predicted weekday to weekend price (rounded to 2 decimals)
# by capacity, one panel per room type.
ggplot(
  result_grouped,
  aes(
    x = person_capacity,
    y = round((avg_weekday_price / avg_weekend_price), 2),
    color = room_type
  )
) +
  geom_point() +
  geom_line() +
  facet_wrap(~room_type) +
  scale_color_viridis_d(name = "Room Type") +
  labs(
    title = "Line Plot for Average Price Ratio between Weekdays and Weekends",
    subtitle = "GAM Predicted Data \nRatio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays",
    x = "Person Capacity",
    y = "Average Price Ratio"
  ) +
  theme_classic()

# Sort both subsets by room type and add a numeric code for it (the index of
# the room type among the factor levels), for use as a numeric loess predictor.
lwd <- lwd[order(lwd[["room_type"]]), ]
lwd[["room_type_num"]] <- as.numeric(as.factor(lwd[["room_type"]]))

lwe <- lwe[order(lwe[["room_type"]]), ]
lwe[["room_type_num"]] <- as.numeric(as.factor(lwe[["room_type"]]))

# Lookup table from numeric code back to room type label, taken from the
# weekday subset.
mapping <- lwd %>%
  distinct(room_type, room_type_num) %>%
  group_by(room_type, room_type_num)

# Weekday local regression on distance, room type code and capacity
# (local quadratic fits, half of the data in each neighbourhood).
loess_weekday <- loess(
  realSum ~ dist * room_type_num * person_capacity,
  data = lwd,
  span = 0.5,
  degree = 2
)
summary(loess_weekday)
## Call:
## loess(formula = realSum ~ dist * room_type_num * person_capacity, 
##     data = lwd, span = 0.5, degree = 2)
## 
## Number of Observations: 4368 
## Equivalent Number of Parameters: 18.8 
## Residual Standard Error: 118.1 
## Trace of smoother matrix: 22.94  (exact)
## 
## Control settings:
##   span     :  0.5 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE FALSE FALSE
## drop.square:  FALSE FALSE FALSE
# Weekend local regression with the same specification as the weekday fit.
loess_weekend <- loess(
  realSum ~ dist * room_type_num * person_capacity,
  data = lwe,
  span = 0.5,
  degree = 2
)
summary(loess_weekend)
## Call:
## loess(formula = realSum ~ dist * room_type_num * person_capacity, 
##     data = lwe, span = 0.5, degree = 2)
## 
## Number of Observations: 5076 
## Equivalent Number of Parameters: 19.18 
## Residual Standard Error: 118 
## Trace of smoother matrix: 23.43  (exact)
## 
## Control settings:
##   span     :  0.5 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE FALSE FALSE
## drop.square:  FALSE FALSE FALSE
# Replace the stored predictions with each loess model's fitted values.
lwd$weekday_pred <- augment(loess_weekday)$.fitted
lwe$weekend_pred <- augment(loess_weekend)$.fitted

# Observed vs fitted weekday prices against distance, per room type and
# capacity.
ggplot(data = lwd) +
  geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(x = dist, y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  facet_grid(room_type~person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Price") +
  ggtitle("Plot to Show Loess Fit on Data") +
  xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
  labs(subtitle = "Weekday Data") +
  theme_minimal()

# Observed vs fitted weekend prices against distance, per room type and
# capacity.
ggplot(data = lwe) +
  geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(x = dist, y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  # label_both added so the weekend panels are labelled the same way as the
  # weekday panels above.
  facet_grid(room_type~person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Price") +
  ggtitle("Plot to Show Loess Fit on Data") +
  xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
  labs(subtitle = "Weekend Data") +
  theme_minimal()

# Prediction grid for the loess models: room type is given by its numeric
# code because that is what the models were fitted on.
grid <- expand.grid(person_capacity = 2:5, room_type_num = c(1, 2), dist = seq(1, 13, 0.25))
predict1 <- predict(loess_weekday, newdata = grid)
predict2 <- predict(loess_weekend, newdata = grid)
result <- data.frame(grid, weekday_price = as.vector(predict1), weekend_price = as.vector(predict2))

# Bring back the room type labels, then drop grid points for which either
# model returned no prediction (NA).
result <- result %>%
  inner_join(mapping, by = join_by(room_type_num))
result <- na.omit(result)

# Predicted weekday and weekend prices against distance, per room type and
# capacity.
ggplot(data = result) +
  geom_point(aes(x = dist, y = weekday_price, color = "Weekday Price"), shape = 1, size = 0.5) +
  geom_point(aes(x = dist, y = weekend_price, color = "Weekend Price"), shape = 1, size = 0.5) +
  # label_both added to match the panel labelling of the GAM prediction plot.
  facet_grid(room_type~person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Time of the Week") +
  ggtitle("Plot to Show Loess Predictions") +
  xlab("Distance (km)") + ylab("Predicted Price") +
  labs(subtitle = "Comparing Weekday and Weekend Trends") +
  theme_minimal()

# Average the loess grid predictions within each room type / capacity
# combination; the grouping is kept on the result.
result_grouped <- result %>%
  group_by(room_type, person_capacity) %>%
  summarise(
    avg_dist = mean(dist),
    avg_weekday_price = mean(weekday_price),
    avg_weekend_price = mean(weekend_price),
    .groups = "keep"
  )
result_grouped
## # A tibble: 8 × 5
## # Groups:   room_type, person_capacity [8]
##   room_type       person_capacity avg_dist avg_weekday_price avg_weekend_price
##   <chr>                     <int>    <dbl>             <dbl>             <dbl>
## 1 Entire home/apt               2     6.88              343.              338.
## 2 Entire home/apt               3     6.88              385.              377.
## 3 Entire home/apt               4     6.88              465.              455.
## 4 Entire home/apt               5     6.88              566.              555.
## 5 Private room                  2     6.88              186.              188.
## 6 Private room                  3     6.88              197.              202.
## 7 Private room                  4     6.88              221.              240.
## 8 Private room                  5     6.88              273.              310.
# Ratio of average predicted weekday to weekend price (rounded to 2 decimals)
# by capacity, one panel per room type, from the loess predictions.
ggplot(
  result_grouped,
  aes(
    x = person_capacity,
    y = round((avg_weekday_price / avg_weekend_price), 2),
    color = room_type
  )
) +
  geom_point() +
  geom_line() +
  facet_wrap(~room_type) +
  scale_color_viridis_d(name = "Room Type") +
  labs(
    title = "Line Plot for Average Price Ratio between Weekdays and Weekends (Loess Predicted Data)",
    subtitle = "Ratio Value: \nValue below 1 -> Price Higher on Weekends \nValue above 1 -> Price Higher on Weekdays",
    x = "Person Capacity",
    y = "Average Price Ratio"
  ) +
  theme_classic()

# Reduced weekday GAM: smooth effect of distance plus a linear capacity term,
# without room type.
gam_weekday <- gam(
  realSum ~ s(dist) + person_capacity,
  data = lwd,
  method = "REML"
)
summary(gam_weekday)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + person_capacity
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       28.206      6.042   4.668 3.13e-06 ***
## person_capacity  105.804      2.071  51.084  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 7.954  8.714 87.73  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.463   Deviance explained = 46.4%
## -REML =  27892  Scale est. = 20543     n = 4368
# Reduced weekend GAM with the same specification as the weekday model.
gam_weekend <- gam(
  realSum ~ s(dist) + person_capacity,
  data = lwe,
  method = "REML"
)
summary(gam_weekend)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + person_capacity
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       41.011      5.529   7.417  1.4e-13 ***
## person_capacity  102.337      1.886  54.248  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 7.232   8.26 103.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.454   Deviance explained = 45.5%
## -REML =  32322  Scale est. = 19846     n = 5076
# Replace the stored predictions with the reduced GAMs' fitted values.
lwd$weekday_pred <- augment(gam_weekday)$.fitted
lwe$weekend_pred <- augment(gam_weekend)$.fitted

# Observed vs fitted weekday prices against distance, per room type and
# capacity.
ggplot(data = lwd) +
  geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(x = dist, y = weekday_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  facet_grid(room_type~person_capacity, labeller = label_both) +
  # The colour legend separates actual from predicted prices, so it is titled
  # "Price" (as in the weekend plot), not "Room Type".
  scale_color_viridis_d(name = "Price") +
  ggtitle("Plot to Show GAM Fit on Data") +
  xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
  labs(subtitle = "Weekday Data") +
  theme_minimal()

# Observed vs fitted weekend prices against distance, per room type and
# capacity.
ggplot(data = lwe) +
  geom_point(aes(x = dist, y = realSum, color = "Actual Price"), shape = 1, size = 0.5) +
  geom_point(aes(x = dist, y = weekend_pred, color = "Predicted Price"), shape = 1, size = 0.5) +
  facet_grid(room_type~person_capacity, labeller = label_both) +
  scale_color_viridis_d(name = "Price") +
  ggtitle("Plot to Show GAM Fit on Data") +
  xlab("Distance (km)") + ylab("Price of Listing (EUR)") +
  labs(subtitle = "Weekend Data") +
  theme_minimal()

Analysis 2

# Eight-colour palette of hex codes, starting with black.
# NOTE(review): the name suggests a colour-blind-friendly palette - confirm.
cbbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

Loading the data

# Load both listing tables and stack them, weekend rows first.
weekdays <- read.csv("london_weekdays.csv")
weekends <- read.csv("london_weekends.csv")
df <- rbind(weekends, weekdays)

# Label each row by the table it came from. The row counts are taken from the
# tables themselves rather than a hard-coded 5379, so the labels stay correct
# if the input files change.
df$day_type <- rep(
  c("Weekend", "Weekday"),
  times = c(nrow(weekends), nrow(weekdays))
)
# Statistics for weekday prices
summary(weekdays[["realSum"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    64.68   167.46   256.36   360.23   435.45 15499.89
# Statistics for weekend prices
summary(weekends[["realSum"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    54.33   174.51   268.12   364.39   438.27 12937.27
# Statistics for the distance from the city center, over all listings
summary(df[["dist"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.04055  3.54913  4.92412  5.32642  6.83692 17.32121
# Share of listings with a distance of at most 13 (empirical CDF at 13)
ecdf(df[["dist"]])(13)
## [1] 0.9792855
# Total number of listings
cat("Count of all Rooms:", nrow(df))
## Count of all Rooms: 9993
# Number of listings whose room type is "Shared room".
filtered_df <- filter(df, room_type == "Shared room")
count_shared_rooms <- dim(filtered_df)[1]
cat(" \nCount of Shared Rooms:", count_shared_rooms)
##  
## Count of Shared Rooms: 50
# Number of shared rooms with a person capacity above three.
filtered_df <- filter(df, room_type == "Shared room", person_capacity > 3)
count_shared_rooms <- dim(filtered_df)[1]
cat("Count of rows where room_type is 'Shared room' and person_capacity is greater than 3:", count_shared_rooms, "\n")
## Count of rows where room_type is 'Shared room' and person_capacity is greater than 3: 9
# Number of listings that report a person capacity of zero.
filtered_df <- df %>%
  filter(person_capacity == 0)
listing_with_0_rooms <- nrow(filtered_df)
# The label names the value actually tested by the filter (0, not 1).
cat("Count of rows where person_capacity is equal to 0:", listing_with_0_rooms, "\n")
## Count of rows where person_capacity is equal to 0: 0
# Number of listings with a person capacity of exactly one.
filtered_df <- filter(df, person_capacity == 1)
listing_with_1_rooms <- dim(filtered_df)[1]
cat("Count of rows person_capacity is equal to 1:", listing_with_1_rooms, "\n")
## Count of rows person_capacity is equal to 1: 0
# Number of listings with a person capacity of exactly two.
filtered_df <- filter(df, person_capacity == 2)
listing_with_2_rooms <- dim(filtered_df)[1]
cat("Count of rows person_capacity is equal to 2:", listing_with_2_rooms, "\n")
## Count of rows person_capacity is equal to 2: 6207
# Number of listings farther than 13 from the city center.
# `dist` is labelled in kilometres on the plots in this file, so the message
# reports km; the variable name is kept as-is in case it is used elsewhere.
filtered_df <- df %>%
  filter(dist > 13)
listing_farther_than_13_miles <- nrow(filtered_df)
cat("Count of listings which are more than 13 km from city center:", listing_farther_than_13_miles, "\n")
## Count of listings which are more than 13 km from city center: 207
# Bar chart of the number of listings at each person capacity, one panel per
# day type.
ggplot(df) +
  geom_bar(aes(x = as.factor(person_capacity))) +
  facet_grid(cols = vars(day_type)) +
  ggtitle("Frequency of Person Capacity") +
  xlab("Person Capacity") +
  ylab("Frequency")

# ggplot(df, aes(x = as.factor(person_capacity))) +
#   geom_bar() +
#   facet_grid(~ room_type) +
#   labs(title = "Frequency of Person Capacity by Room Type",
#        x = "Person Capacity",
#        y = "Frequency")

# Same capacity counts, split by room type (rows) and day type (columns).
ggplot(df) +
  geom_bar(aes(x = as.factor(person_capacity))) +
  facet_grid(rows = vars(room_type), cols = vars(day_type)) +
  ggtitle("Frequency of Person Capacity by Room Type and Day Type") +
  xlab("Person Capacity") +
  ylab("Frequency")

# Kernel density of listing price, one panel per day type.
ggplot(df) +
  geom_density(aes(x = realSum), fill = "blue", alpha = 0.5) +
  facet_grid(cols = vars(day_type)) +
  ggtitle("Smooth Frequency Plot for real_sum") +
  xlab("real_sum") +
  ylab("Density")

# Kernel density of distance from the city center, one panel per day type.
# The title and x label name the plotted variable (dist); they previously
# repeated the labels of the price density plot.
ggplot(df, aes(x = dist)) +
  geom_density(fill = "blue", alpha = 0.5) +
  facet_grid(~day_type) +
  labs(title = "Smooth Frequency Plot for dist",
       x = "dist",
       y = "Density")

# Build the analysis subset in one pipeline:
#   1. keep listings priced below 1000, closer than 13 to the center, and not
#      of type "Shared room";
#   2. recode a person capacity of 6 as 5 (other capacities pass through
#      as.integer());
#   3. drop any row that still contains a missing value.
subset_df <- df %>%
  subset(realSum < 1000 & dist < 13 & room_type != "Shared room") %>%
  mutate(
    person_capacity = ifelse(person_capacity == 6, 5, as.integer(person_capacity))
  ) %>%
  na.omit()
head(subset_df)
##   X  realSum       room_type room_shared room_private person_capacity
## 1 0 121.1223    Private room       False         True               2
## 2 1 195.9124    Private room       False         True               2
## 3 2 193.3253    Private room       False         True               3
## 4 3 180.3899    Private room       False         True               2
## 5 4 405.7010 Entire home/apt       False        False               3
## 6 5 354.1946 Entire home/apt       False        False               2
##   host_is_superhost multi biz cleanliness_rating guest_satisfaction_overall
## 1             False     0   0                  6                         69
## 2             False     1   0                 10                         96
## 3             False     1   0                 10                         95
## 4             False     1   0                  9                         87
## 5             False     0   1                  7                         65
## 6             False     0   1                  9                         93
##   bedrooms     dist metro_dist attr_index attr_index_norm rest_index
## 1        1 5.734117  0.4370940   222.8822        15.49341   470.0885
## 2        1 4.788905  1.4640505   235.3858        16.36259   530.1335
## 3        1 4.596677  0.4503062   268.9138        18.69325   548.9876
## 4        1 2.054769  0.1326705   472.3813        32.83707  1021.2711
## 5        0 4.491277  0.3541075   318.4915        22.13958   692.7754
## 6        0 4.467894  0.3507494   321.8646        22.37406   703.0686
##   rest_index_norm      lng      lat day_type
## 1        8.413765 -0.04975 51.52570  Weekend
## 2        9.488466 -0.08475 51.54210  Weekend
## 3        9.825922 -0.14585 51.54802  Weekend
## 4       18.278973 -0.10611 51.52108  Weekend
## 5       12.399473 -0.18797 51.49399  Weekend
## 6       12.583702 -0.18805 51.49473  Weekend

Plotting the data

# Read the Google Maps API key from the environment instead of embedding it in
# the source. A key that has been committed to a file should be treated as
# leaked and revoked in the Google Cloud console.
api_key <- Sys.getenv("GGMAP_GOOGLE_API_KEY")
if (!nzchar(api_key)) {
  stop(
    "Set the GGMAP_GOOGLE_API_KEY environment variable to a Google Maps API key.",
    call. = FALSE
  )
}
register_google(key = api_key)
# Base map centred on London, used as the background for the listing points.
london_map <- get_map(location = "London", zoom = 11)
## ℹ <https://maps.googleapis.com/maps/api/staticmap?center=London&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx>
## ℹ <https://maps.googleapis.com/maps/api/geocode/json?address=London&key=xxx>
# Overlay the filtered listings on the London base map, shaded by price.
ggmap(london_map) +
  geom_point(
    aes(x = lng, y = lat, color = realSum),
    data = subset_df,
    alpha = 0.5,
    size = 1.3
  ) +
  scale_color_gradient(name = "Price", low = "#fff200", high = "#301934") +
  labs(x = "", y = "")

# Price against distance per day type, with a straight-line fit and a robust
# local-linear loess fit (orange) overlaid.
ggplot(subset_df, aes(x = dist, y = realSum)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(
    method = "loess",
    span = 0.5,
    method.args = list(degree = 1, family = "symmetric"),
    colour = "orange"
  ) +
  facet_grid(cols = vars(day_type))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

  #geom_smooth(method = "rlm", se = FALSE, col = "pink", method.args = list(psi = psi.bisquare)) 
# Log price against distance per day type, with a straight-line fit and a
# robust local-linear loess fit (orange) overlaid.
ggplot(subset_df, aes(x = dist, y = log(realSum))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(
    method = "loess",
    span = 0.5,
    method.args = list(degree = 1, family = "symmetric"),
    colour = "orange"
  ) +
  facet_grid(cols = vars(day_type))
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

  #geom_smooth(method = "rlm", se = FALSE, col = "pink", method.args = list(psi = psi.bisquare)) 
# Overlapping price histograms per room type, one free-scaled panel per day
# type.
ggplot(subset_df, aes(x = realSum, fill = room_type)) +
  geom_histogram(alpha = 0.7, position = "identity", binwidth = 50) +
  facet_wrap(vars(day_type), scales = "free") +
  ggtitle("Distribution of Airbnb Prices on Weekends and Weekdays by Room Type") +
  xlab("Price") +
  ylab("Frequency") +
  theme_minimal() +
  xlim(0, 2000) +
  scale_fill_viridis(discrete = TRUE)
## Warning: Removed 8 rows containing missing values (`geom_bar()`).

# Side-by-side (dodged) price histograms per room type, one panel per day type.
ggplot(subset_df, aes(x = realSum, fill = room_type)) +
  geom_histogram(alpha = 0.7, position = "dodge", binwidth = 50) +
  ggtitle("Distribution of Airbnb Prices on Weekends and Weekdays by Room Type") +
  xlab("Price") +
  ylab("Frequency") +
  theme_minimal() +
  xlim(0, 2000) +
  scale_fill_viridis(discrete = TRUE) +
  facet_wrap(vars(day_type))
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

From here we can understand: - On weekends, for the same price, there are more bookings for a private room. - In general, entire homes and apartments are less in demand than private rooms on both weekends and weekdays, and for the same price the demand for an entire apartment is higher on weekends (as expected).

# Mean and standard deviation of price for every room type / day type pair.
average_prices <- subset_df %>%
  group_by(room_type, day_type) %>%
  summarize(
    avg_price = mean(realSum),
    sd_price = sd(realSum),
    .groups = "drop"
  )
# Error bars show +/- one standard deviation of the listing prices within each
# group. Calling sd(avg_price) inside aes() instead measured the spread of the
# group means themselves, which gave every bar the same length.
ggplot(average_prices, aes(x = room_type, y = avg_price, color = room_type)) +
  geom_point(position = position_dodge(width = 0.5), size = 7) +
  geom_errorbar(
    aes(ymin = avg_price - sd_price, ymax = avg_price + sd_price),
    # Same dodge width as the points so bars and points stay aligned.
    position = position_dodge(width = 0.5),
    width = 0.2
  ) +
  labs(title = "Average Price by Room Type",
       x = "Room Type",
       y = "Average Price",
       color = "Room Type") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)

# Mean and standard deviation of price for every room type / day type pair,
# explicitly excluding shared rooms.
average_prices <- subset_df %>%
  filter(room_type != "Shared room") %>%
  group_by(room_type, day_type) %>%
  summarize(
    avg_price = mean(realSum),
    sd_price = sd(realSum),
    .groups = "drop"
  )
# Error bars show +/- one standard deviation of the listing prices within each
# group. Calling sd(avg_price) inside aes() instead measured the spread of the
# group means themselves, which gave every bar the same length.
ggplot(average_prices, aes(x = room_type, y = avg_price, color = room_type)) +
  geom_point(position = position_dodge(width = 0.5), size = 7) +
  geom_errorbar(
    aes(ymin = avg_price - sd_price, ymax = avg_price + sd_price),
    # Same dodge width as the points so bars and points stay aligned.
    position = position_dodge(width = 0.5),
    width = 0.2
  ) +
  labs(title = "Average Price by Room Type",
       x = "Room Type",
       y = "Average Price",
       color = "Room Type") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)

# Mean price for each room type / cleanliness rating / day type combination.
average_prices <- subset_df %>%
  group_by(room_type, cleanliness_rating, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# The y axis shows log(avg_price), so the title and axis label say "Log".
ggplot(average_prices, aes(x = cleanliness_rating, y = log(avg_price), color = room_type)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
  scale_color_viridis_d() +
  labs(title = "Log Average Price for Each Room Type by Cleanliness Rating",
       x = "Cleanliness Rating",
       y = "Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_wrap(~day_type)

# Mean price for each room type / person capacity / day type combination.
average_prices_capacity <- subset_df %>%
  group_by(room_type, person_capacity, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# Points and connecting lines share one dodge width; with different widths
# (0.8 vs 0.5) the dashed lines did not pass through the points.
ggplot(average_prices_capacity, aes(x = person_capacity, y = log(avg_price), color = room_type)) +
  geom_point(position = position_dodge(width = 0.5), size = 6) +
  geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
  scale_color_viridis_d() +
  labs(title = "Log Average Price for Each Room Type by Person Capacity",
       x = "Person Capacity",
       y = " Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_wrap(~day_type)

# Mean price for each room type / person capacity / day type combination,
# explicitly excluding shared rooms.
average_prices_capacity <- subset_df %>%
  filter(room_type != "Shared room") %>%
  group_by(room_type, person_capacity, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# Points and connecting lines share one dodge width; with different widths
# (0.8 vs 0.5) the dashed lines did not pass through the points.
ggplot(average_prices_capacity, aes(x = person_capacity, y = log(avg_price), color = room_type)) +
  geom_point(position = position_dodge(width = 0.5), size = 6) +
  geom_line(aes(group = room_type), position = position_dodge(width = 0.5), linetype = "dashed") +
  scale_color_viridis_d() +
  labs(title = "Log Average Price for Each Room Type by Person Capacity",
       x = "Person Capacity",
       y = " Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_wrap(~day_type)

# Mean price for each room type / distance / day type combination.
# NOTE(review): `dist` is continuous, so most groups probably hold a single
# listing - consider binning the distance before averaging.
average_prices_distance <- subset_df %>%
  group_by(room_type, dist, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# position_dodge() is dropped: x is continuous and every panel holds a single
# room type, so dodging only produced "requires non-overlapping x intervals"
# warnings. Labels now state that the y axis is on the log scale.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
  geom_point(size = 3) +
  geom_line(aes(group = room_type), linetype = "dashed") +
  labs(title = "Log Average Price for Each Room Type by Distance",
       x = "Distance from City Center",
       y = "Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_grid(room_type ~ day_type)

# Mean price for each room type / distance / day type combination.
# NOTE(review): `dist` is continuous, so most groups probably hold a single
# listing - consider binning the distance before averaging.
average_prices_distance <- subset_df %>%
  group_by(room_type, dist, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# position_dodge() is dropped: x is continuous and every panel holds a single
# room type, so dodging only produced "requires non-overlapping x intervals"
# warnings. Labels now state that the y axis is on the log scale.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
  geom_point(size = 3) +
  geom_smooth(aes(group = room_type), method = "loess", se = FALSE) +
  scale_color_viridis_d() + # Set the color palette to Viridis
  labs(title = "Log Average Price for Each Room Type by Distance",
       x = "Distance from City Center",
       y = "Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'

# Mean price for each room type / distance / day type combination.
# NOTE(review): `dist` is continuous, so most groups probably hold a single
# listing - consider binning the distance before averaging.
average_prices_distance <- subset_df %>%
  group_by(room_type, dist, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# position_dodge() is dropped: x is continuous and every panel holds a single
# room type, so dodging only produced "requires non-overlapping x intervals"
# warnings.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
  geom_point(size = 3, alpha = 0.5) +
  # Draw the loess line in black so it stands out from the coloured points.
  geom_smooth(aes(group = room_type), method = "loess", se = FALSE, color = "black") +
  scale_color_viridis_d() +
  labs(title = "Log Average Price for Each Room Type by Distance",
       x = "Distance from City Center",
       y = "Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'

# Mean price for each room type / distance / day type combination, explicitly
# excluding shared rooms.
# NOTE(review): `dist` is continuous, so most groups probably hold a single
# listing - consider binning the distance before averaging.
average_prices_distance <- subset_df %>%
  filter(room_type != "Shared room") %>%
  group_by(room_type, dist, day_type) %>%
  summarize(avg_price = mean(realSum), .groups = "drop")
# position_dodge() is dropped: x is continuous and every panel holds a single
# room type, so dodging only produced "requires non-overlapping x intervals"
# warnings.
ggplot(average_prices_distance, aes(x = dist, y = log(avg_price), color = room_type)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_smooth(aes(group = room_type), method = "loess", se = FALSE, color = "black") +
  scale_color_viridis_d() +
  labs(title = "Log Average Price for Each Room Type by Distance",
       x = "Distance from City Center",
       y = "Log Average Price",
       color = "Room Type") +
  theme_minimal() +
  facet_grid(room_type ~ day_type)
## `geom_smooth()` using formula = 'y ~ x'

# Log price against log distance for the filtered listings, coloured by
# superhost status, one panel per day type.
# Both axes are log-transformed, so the title and axis labels say so.
ggplot(subset_df, aes(x = log(dist), y = log(realSum), color = host_is_superhost)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Log Price and Log Distance",
       x = "Log distance from city center",
       y = "Log price",
       color = "Host") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Just Exploring from here:

# Price against cleanliness rating, coloured by room type, one panel per day
# type.
# NOTE(review): `combined` is not defined in this part of the file - confirm
# it exists before this point.
# The automatically chosen GAM smoother failed here ("x has insufficient unique
# values to support 10 knots") because the rating takes only a few distinct
# values, so no trend line was drawn. A linear fit is used instead, as in the
# cleanliness vs. guest satisfaction plot later in this file.
ggplot(combined, aes(x = cleanliness_rating, y = realSum, color = room_type)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Cleanliness and Price",
       x = "Cleanliness",
       y = "Price",
       color = "Room Type") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using formula = 'y ~ x'

Observations:

  1. On both weekends and weekdays, the price is higher when the cleanliness is high, and the Airbnbs at the higher end of the price range are generally entire homes.
  2. It is also observed that for private rooms it does not matter much whether it is a weekday or a weekend: the prices remain more or less the same for a cleanliness of 6 and above.
# Treat the bedroom count as a category so it can drive a discrete colour
# scale.
df$bedrooms_cat <- as.factor(df$bedrooms)
# Price against cleanliness rating, coloured by number of bedrooms, one panel
# per day type.
# The automatically chosen GAM smoother failed here ("x has insufficient unique
# values to support 10 knots") because the rating takes only a few distinct
# values, so no trend line was drawn. A linear fit is used instead, as in the
# cleanliness vs. guest satisfaction plot later in this file.
ggplot(df, aes(x = cleanliness_rating, y = realSum, color = bedrooms_cat)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Cleanliness and Price",
       x = "Cleanliness",
       y = "Price",
       color = "No. of bed rooms") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using formula = 'y ~ x'

Obs: 1. The highest prices are for cleanliness ratings of 10 or 8 and for Airbnbs with 0 to 3 bedrooms.

# Price for each bedroom category, coloured by superhost status, one panel per
# day type. The x-axis label typo ("bedroooms") is corrected.
# NOTE(review): x is a factor here, so the smoother has no continuous predictor
# to fit - confirm whether the geom_smooth() layer draws anything useful.
ggplot(df, aes(x = bedrooms_cat, y = realSum, color = host_is_superhost))  +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Number of bedrooms and Price",
       x = "Number of bedrooms",
       y = "Price",
       color = "Is host superhost") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Obs: 1. The prices of Airbnbs with the same number of bedrooms remain more or less the same on the two day types. 2. Generally, the listings at the higher end of the price range belong to superhosts.

# Log price against log distance for all listings, coloured by superhost
# status, one panel per day type.
# Both axes are log-transformed, so the title and axis labels say so.
ggplot(df, aes(x = log(dist), y = log(realSum), color = host_is_superhost)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "auto", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Log Price and Log Distance",
       x = "Log distance from city center",
       y = "Log price",
       color = "Host") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Obs: 1. In general, on both weekdays and weekends, the price of an Airbnb decreases as the distance increases. 2. On both day types, there are more non-superhosts with Airbnbs from 2-10 in the higher price range.

# Price against cleanliness rating, coloured by superhost status, one panel
# per day type.
# The automatically chosen GAM smoother failed here ("x has insufficient unique
# values to support 10 knots") because the rating takes only a few distinct
# values, so no trend line was drawn. A linear fit is used instead, as in the
# cleanliness vs. guest satisfaction plot later in this file.
ggplot(df, aes(x = cleanliness_rating, y = realSum, color = host_is_superhost)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Cleanliness and Price",
       x = "Cleanliness",
       y = "Price",
       color = "Host") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~day_type)
## `geom_smooth()` using formula = 'y ~ x'

# Guest satisfaction against cleanliness rating, one panel per day type.
# The automatically chosen GAM smoother failed here ("x has insufficient unique
# values to support 10 knots") because the rating takes only a few distinct
# values, so no trend line was drawn; a linear fit is used instead.
# No colour aesthetic is mapped in this plot, so the unused colour label and
# colour scale are removed.
ggplot(df, aes(x = cleanliness_rating, y = guest_satisfaction_overall)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed") +
  labs(title = "Relation between Cleanliness and Guest Satisfaction",
       x = "Cleanliness",
       y = "Guest Satisfaction") +
  theme_minimal() +
  facet_wrap(~day_type)
## `geom_smooth()` using formula = 'y ~ x'

# Guest satisfaction against cleanliness rating with one linear fit per
# superhost status, one panel per day type.
ggplot(
  df,
  aes(x = cleanliness_rating, y = guest_satisfaction_overall, color = host_is_superhost)
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Relation between Cleanliness and Guest Satisfaction") +
  xlab("Cleanliness") +
  ylab("Guest Satisfaction") +
  labs(color = "Host") +
  theme_minimal() +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(vars(day_type))
## `geom_smooth()` using formula = 'y ~ x'

# Modelling data: every listing priced below 2500.
filtered_df <- df[df$realSum < 2500, ]
#rm(list = (weekdays,weekends, weekend_data))

Modelling the relationship between the distance and the price

# Ordinary least-squares fit of price on distance and person capacity, using
# the listings in filtered_df; summary() prints the coefficient table and fit
# statistics.
dist.model <- lm(realSum ~ dist + person_capacity, data = filtered_df)
summary(dist.model)
## 
## Call:
## lm(formula = realSum ~ dist + person_capacity, data = filtered_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -583.21 -104.63  -35.17   61.68 1912.70 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     157.9209     6.7973   23.23   <2e-16 ***
## dist            -26.7318     0.7523  -35.53   <2e-16 ***
## person_capacity 115.5685     1.6418   70.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 202.2 on 9953 degrees of freedom
## Multiple R-squared:  0.4084, Adjusted R-squared:  0.4083 
## F-statistic:  3436 on 2 and 9953 DF,  p-value: < 2.2e-16
# Add fitted values and residuals from the linear model to the modelling data.
lniear_model <- augment(dist.model, filtered_df)
# Residuals against distance with a loess trend, one panel per day type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning. The scale is added once (it was duplicated).
ggplot(lniear_model, aes(x = dist, y = .resid)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Residuals v/s Distance(Linear model)") +
  xlab("Distance") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'

From the above residual plot, the smoothed residual line shows a slight curve, which suggests that we can do better than the existing linear model.

# Fitted values of the linear model against distance with a loess trend, one
# panel per day type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning. The scale is added once (it was duplicated).
ggplot(lniear_model, aes(x = dist, y = .fitted)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Fitted Values v/s Distance(Linear model)") +
  xlab("Distance") +
  ylab("Fitted Values") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

Fitting a GAM model

# GAM of price with a smooth term in distance, s(dist), and a linear term in
# person capacity, fitted to the listings in filtered_df; summary() prints the
# parametric coefficients and the smooth-term table.
dist.gam.model <- gam(realSum ~ s(dist) + person_capacity, data = filtered_df)
summary(dist.gam.model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + person_capacity
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       19.856      5.015   3.959 7.58e-05 ***
## person_capacity  113.983      1.620  70.348  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 8.349  8.886 190.4  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.43   Deviance explained = 43.1%
## GCV =  39431  Scale est. = 39390     n = 9956
# Modelling data with the GAM's fitted values and residuals appended.
gam.df <- augment(dist.gam.model, data = filtered_df)

Plotting the residuals values for the GAM model

# GAM residuals against distance with a loess trend, one panel per day type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning.
ggplot(gam.df, aes(x = dist, y = .resid)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Residuals v/s Distance (GAM model)") +
  xlab("Distance") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'

# GAM fitted values against distance with a loess trend, one panel per day
# type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning.
ggplot(gam.df, aes(x = dist, y = .fitted)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Fitted values v/s Distance (GAM model)") +
  xlab("Distance") +
  ylab("Fitted values") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

Plotting the log of price v/s log of distance values

# Log price against log distance, coloured by person capacity, one panel per
# day type.
# The colour scale is added once (it was duplicated, which triggered a "Scale
# for colour is already present" message), and the axis labels state that both
# axes are log-transformed.
ggplot(gam.df, aes(x = log(dist), y = log(realSum), color = person_capacity)) +
  geom_point(size = 2) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Log Price v/s Log Distance") +
  xlab("Log Distance") +
  ylab("Log Price")

# Ordinary least-squares fit of price on distance alone, using the listings in
# filtered_df; summary() prints the coefficient table and fit statistics.
rt.model <- lm(realSum ~ dist, data = filtered_df)
summary(rt.model)
## 
## Call:
## lm(formula = realSum ~ dist, data = filtered_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -334.03 -159.37  -62.28   85.80 1962.52 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 518.2529     5.4732   94.69   <2e-16 ***
## dist        -32.7213     0.9148  -35.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 247.5 on 9954 degrees of freedom
## Multiple R-squared:  0.1139, Adjusted R-squared:  0.1138 
## F-statistic:  1279 on 1 and 9954 DF,  p-value: < 2.2e-16
# Replace the fitted model object with the modelling data plus its fitted
# values and residuals; from here on `rt.model` is a data frame, not a model.
rt.model <- augment(rt.model, filtered_df)

# Residuals of the distance-only linear model against distance with a loess
# trend, one panel per day type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning. The scale is added once (it was duplicated).
ggplot(rt.model, aes(x = dist, y = .resid)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Residuals v/s Distance (Linear model)") +
  xlab("Distance") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'

# Fitted values of the distance-only linear model against distance with a
# loess trend, one panel per day type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning. The scale is added once (it was duplicated).
ggplot(rt.model, aes(x = dist, y = .fitted)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Fitted Values v/s Distance (Linear model)") +
  xlab("Distance") +
  ylab("Fitted values") +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# GAM fitted values against distance with a loess trend, one panel per day
# type.
# Colour is mapped on the points only: as a global aesthetic it was dropped by
# geom_smooth() with a warning.
ggplot(gam.df, aes(x = dist, y = .fitted)) +
  geom_point(aes(color = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(day_type ~ ., nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Fitted values v/s Distance") +
  xlab("Distance") +
  ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'

# Refit the GAM with the smooth distance term only (no person capacity),
# overwriting the earlier dist.gam.model; summary() prints the fit.
dist.gam.model <- gam(realSum ~ s(dist) , data = filtered_df)
summary(dist.gam.model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  343.740      2.434   141.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 8.445  8.916 192.5  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.147   Deviance explained = 14.8%
## GCV =  59025  Scale est. = 58969     n = 9956
# Attach per-row fitted values (.fitted) and residuals (.resid) from the
# distance-only GAM to the data it was fitted on.
gam.df <- augment(dist.gam.model, filtered_df)

# Residuals against distance, one panel per day type, with a LOESS trend.
# The colour mapping is placed on geom_point() only: when a continuous colour
# is inherited by geom_smooth() it cannot define groups, so ggplot2 drops it
# and emits a "aesthetics were dropped" warning. The drawn plot is unchanged.
ggplot(gam.df, aes(x = dist, y = .resid)) +
  geom_point(aes(colour = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ day_type, nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Residuals v/s Distance") +
  xlab("Distance") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Fitted values of the GAM against distance, one panel per day type, points
# coloured by person_capacity, with a LOESS trend line per panel.
# The colour mapping is placed on geom_point() only: when a continuous colour
# is inherited by geom_smooth() it cannot define groups, so ggplot2 drops it
# and emits a "aesthetics were dropped" warning. The drawn plot is unchanged.
# The title also gains its missing closing parenthesis.
ggplot(gam.df, aes(x = dist, y = .fitted)) +
  geom_point(aes(colour = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ day_type, nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Fitted values v/s Distance (GAM model)") +
  xlab("Distance") +
  ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Extended GAM: smooth effect of distance plus a room_type x person_capacity
# interaction (main effects included by the `*` operator).
modified <- gam(
  realSum ~ s(dist) + room_type * person_capacity,
  data = filtered_df
)

# Attach per-row fitted values and residuals from the extended model.
modified.df <- augment(modified, filtered_df)

# Residuals against distance, one panel per day type, with a LOESS trend.
# The colour mapping is placed on geom_point() only: when a continuous colour
# is inherited by geom_smooth() it cannot define groups, so ggplot2 drops it
# and emits a "aesthetics were dropped" warning. The drawn plot is unchanged.
ggplot(modified.df, aes(x = dist, y = .resid)) +
  geom_point(aes(colour = person_capacity), size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ day_type, nrow = 1, ncol = 2) +
  scale_color_viridis() +
  ggtitle("Residuals v/s Distance") +
  xlab("Distance") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Fitted values of the extended GAM against distance, one panel per
# person_capacity value (laid out on a 2 x 5 grid), with a LOESS trend.
# The colour mapping stays global here: person_capacity is constant within
# each panel, so the smoother can keep the colour.
# The title gains its missing closing parenthesis.
ggplot(modified.df, aes(x = dist, y = .fitted, color = person_capacity)) +
  geom_point(size = 2) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(person_capacity ~ ., nrow = 2, ncol = 5) +
  scale_color_viridis() +
  ggtitle("Fitted values v/s Distance (GAM model)") +
  xlab("Distance") +
  ylab("Fitted Values")
## `geom_smooth()` using formula = 'y ~ x'

# Print the parametric coefficients and smooth-term summary of `modified`.
summary(modified)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## realSum ~ s(dist) + room_type * person_capacity
## 
## Parametric coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            192.500      7.789  24.713  < 2e-16 ***
## room_typePrivate room                  -64.048     11.865  -5.398  6.9e-08 ***
## room_typeShared room                   -58.539     71.688  -0.817   0.4142    
## person_capacity                         85.984      2.011  42.766  < 2e-16 ***
## room_typePrivate room:person_capacity  -45.098      4.320 -10.438  < 2e-16 ***
## room_typeShared room:person_capacity   -62.631     24.841  -2.521   0.0117 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 8.183  8.825 175.7  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.514   Deviance explained = 51.4%
## GCV =  33672  Scale est. = 33624     n = 9956
# ggplot(df, aes(x = as.factor(person_capacity))) +
#   geom_bar() +
#   facet_grid(~ room_type) +
#   labs(title = "Frequency of Person Capacity by Room Type",
#        x = "Person Capacity",
#        y = "Frequency")

# Bar chart of row counts per person_capacity value, with room types as
# facet rows and day types as facet columns.
ggplot(data = df, mapping = aes(x = as.factor(person_capacity))) +
  geom_bar() +
  facet_grid(rows = vars(room_type), cols = vars(day_type)) +
  ggtitle("Frequency of Person Capacity by Room Type and Day Type") +
  xlab("Person Capacity") +
  ylab("Frequency")

# Kernel density of realSum, one panel (column) per day type.
ggplot(data = df, mapping = aes(x = realSum)) +
  geom_density(alpha = 0.5, fill = "blue") +
  facet_grid(cols = vars(day_type)) +
  ggtitle("Smooth Frequency Plot for real_sum") +
  xlab("real_sum") +
  ylab("Density")

# Kernel density of dist, one panel (column) per day type.
# The title and x-axis label now name the plotted variable (dist); they were
# previously copied from the realSum density plot and said "real_sum".
ggplot(df, aes(x = dist)) +
  geom_density(fill = "blue", alpha = 0.5) +
  facet_grid(~day_type) +
  labs(
    title = "Smooth Frequency Plot for dist",
    x = "dist",
    y = "Density"
  )

To make the distribution of the response more symmetric, we model log(realSum) instead of realSum.

# Rows of subset_df whose day_type is "Weekday".
d1 <- subset_df[subset_df$day_type == "Weekday", ]

# GAM for log(realSum): smooth in distance plus a
# person_capacity x room_type interaction, smoothing chosen by REML.
model1 <- gam(
  log(realSum) ~ s(dist) + person_capacity * room_type,
  data = d1,
  method = "REML"
)
summary(model1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(realSum) ~ s(dist) + person_capacity * room_type
## 
## Parametric coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            5.504885   0.024474 224.929  < 2e-16 ***
## person_capacity                        0.162348   0.006782  23.940  < 2e-16 ***
## room_typePrivate room                 -0.453561   0.035194 -12.887  < 2e-16 ***
## person_capacity:room_typePrivate room -0.075474   0.012825  -5.885 4.28e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 6.392  7.559 127.5  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.668   Deviance explained = 66.9%
## -REML = 1443.7  Scale est. = 0.1121    n = 4368
# Rows of subset_df whose day_type is "Weekend".
d2 <- subset_df[subset_df$day_type == "Weekend", ]

# GAM for log(realSum): smooth in distance plus a
# person_capacity x room_type interaction, smoothing chosen by REML.
model2 <- gam(
  log(realSum) ~ s(dist) + person_capacity * room_type,
  data = d2,
  method = "REML"
)
summary(model2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## log(realSum) ~ s(dist) + person_capacity * room_type
## 
## Parametric coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            5.523447   0.021751 253.942   <2e-16 ***
## person_capacity                        0.153068   0.006083  25.165   <2e-16 ***
## room_typePrivate room                 -0.531955   0.031759 -16.750   <2e-16 ***
## person_capacity:room_typePrivate room -0.026236   0.011672  -2.248   0.0246 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## s(dist) 6.381  7.549 162.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.66   Deviance explained = 66.1%
## -REML = 1570.3  Scale est. = 0.10761   n = 5076

The larger F-statistic for the smooth term s(dist) in the weekend model (162.6 vs. 127.5 on weekdays) suggests that distance matters more on weekends when determining prices.

# Residual plot (weekday model): residuals of log(realSum) against distance,
# with ggplot2's default smoother overlaid.
# Points are drawn at their true coordinates. Both axes are continuous, so
# jittering (previously +/-0.5 in x and +/-0.25 in y) only added random noise
# to the residuals being diagnosed; transparency handles overplotting instead.
model1.df <- augment(model1, d1)
ggplot(model1.df, aes(x = dist, y = .resid)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Residual plot (weekend model): residuals of log(realSum) against distance,
# with ggplot2's default smoother overlaid.
# Points are drawn at their true coordinates. Both axes are continuous, so
# jittering (previously +/-0.5 in x and +/-0.25 in y) only added random noise
# to the residuals being diagnosed; transparency handles overplotting instead.
model2.df <- augment(model2, d2)
ggplot(model2.df, aes(x = dist, y = .resid)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# head(model1.df)
# Fitted plot (weekday model): fitted values against distance, coloured by
# person_capacity, with room types as facet rows and capacities as columns.
ggplot(
  data = model1.df,
  mapping = aes(x = dist, y = .fitted, colour = person_capacity)
) +
  geom_point(alpha = 0.2, size = 1.5) +
  facet_grid(
    rows = vars(room_type),
    cols = vars(person_capacity),
    scales = "free"
  ) +
  # facet_wrap(~day_type , nrow = 1) +
  scale_color_viridis() +
  labs(
    title = "Weekday Fitted values based on Distance, room type, and capacity",
    x = "Distance",
    y = "Fitted Values"
  ) +
  theme_minimal()

# Fitted plot (weekend model): fitted values against distance, coloured by
# person_capacity, with room types as facet rows and capacities as columns.
ggplot(
  data = model2.df,
  mapping = aes(x = dist, y = .fitted, colour = person_capacity)
) +
  geom_point(alpha = 0.2, size = 1.5) +
  facet_grid(
    rows = vars(room_type),
    cols = vars(person_capacity),
    scales = "free"
  ) +
  # facet_wrap(~day_type , nrow = 1) +
  scale_color_viridis() +
  labs(
    title = "Weekend Fitted values based on Distance, room type, and capacity",
    x = "Distance",
    y = "Fitted Values"
  ) +
  theme_minimal()

# Homoskedasticity check (weekday model): absolute residuals against fitted
# values, with a linear trend line; a sloped line indicates that the residual
# spread changes with the fitted value.
# Points are drawn at their true coordinates. The previous horizontal jitter
# of +/-0.5 displaced the log-scale fitted values by a large random amount;
# transparency handles overplotting instead.
ggplot(model1.df, aes(x = .fitted, y = abs(.resid))) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'